---Importing Necessary Files---
In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import f_oneway
---Making a DataFrame for the Dataset---
In [2]:
# Load the raw UK road-accident records from the local datasets folder.
# A forward slash is used as the separator because it resolves on Windows,
# macOS and Linux alike; the back-slash form only resolves on Windows.
ukroadaccident = pd.read_csv('datasets/uk_road_accident.csv')
---Checking if the DataFrame is Working---
In [3]:
# Display the loaded frame to confirm the CSV was read; the rendered output is truncated to the first and last rows.
ukroadaccident
Out[3]:
| Index | Accident_Severity | Accident Date | Latitude | Light_Conditions | District Area | Longitude | Number_of_Casualties | Number_of_Vehicles | Road_Surface_Conditions | Road_Type | Urban_or_Rural_Area | Weather_Conditions | Vehicle_Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200701BS64157 | Serious | 5/6/2019 | 51.506187 | Darkness - lights lit | Kensington and Chelsea | -0.209082 | 1 | 2 | Dry | Single carriageway | Urban | Fine no high winds | Car |
| 1 | 200701BS65737 | Serious | 2/7/2019 | 51.495029 | Daylight | Kensington and Chelsea | -0.173647 | 1 | 2 | Wet or damp | Single carriageway | Urban | Raining no high winds | Car |
| 2 | 200701BS66127 | Serious | 26-08-2019 | 51.517715 | Darkness - lighting unknown | Kensington and Chelsea | -0.210215 | 1 | 3 | Dry | NaN | Urban | NaN | Taxi/Private hire car |
| 3 | 200701BS66128 | Serious | 16-08-2019 | 51.495478 | Daylight | Kensington and Chelsea | -0.202731 | 1 | 4 | Dry | Single carriageway | Urban | Fine no high winds | Bus or coach (17 or more pass seats) |
| 4 | 200701BS66837 | Slight | 3/9/2019 | 51.488576 | Darkness - lights lit | Kensington and Chelsea | -0.192487 | 1 | 2 | Dry | NaN | Urban | NaN | Other vehicle |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 660674 | 201091NM01760 | Slight | 18-02-2022 | 57.374005 | Daylight | Highland | -3.467828 | 2 | 1 | Dry | Single carriageway | Rural | Fine no high winds | Car |
| 660675 | 201091NM01881 | Slight | 21-02-2022 | 57.232273 | Darkness - no lighting | Highland | -3.809281 | 1 | 1 | Frost or ice | Single carriageway | Rural | Fine no high winds | Car |
| 660676 | 201091NM01935 | Slight | 23-02-2022 | 57.585044 | Daylight | Highland | -3.862727 | 1 | 3 | Frost or ice | Single carriageway | Rural | Fine no high winds | Car |
| 660677 | 201091NM01964 | Serious | 23-02-2022 | 57.214898 | Darkness - no lighting | Highland | -3.823997 | 1 | 2 | Wet or damp | Single carriageway | Rural | Fine no high winds | Motorcycle over 500cc |
| 660678 | 201091NM02142 | Serious | 28-02-2022 | 57.575210 | Daylight | Highland | -3.895673 | 1 | 1 | Wet or damp | Dual carriageway | Rural | Snowing no high winds | Car |
660679 rows × 14 columns
---Cleaning the Inconsistencies on the Accident date---
In [4]:
# Normalise the raw date text in one pass: force string dtype, trim
# surrounding whitespace, and use '-' as the only separator so that values
# such as 5/6/2019 and 26-08-2019 share one layout.
ukroadaccident['Accident Date'] = (
    ukroadaccident['Accident Date']
    .astype('str')
    .str.strip()
    .str.replace('/', '-')
)
---Converting the Data Type of the Accident Date---
In [5]:
# Parse the normalised day-month-year strings into datetime64 values.
# The layout is pinned with an explicit format instead of being inferred, so
# the result does not depend on which value happens to come first in the
# column or on the pandas version. Anything that does not match becomes NaT
# (errors='coerce'); the null check that follows would reveal such rows.
ukroadaccident['Accident Date'] = pd.to_datetime(
    ukroadaccident['Accident Date'],
    format='%d-%m-%Y',
    errors='coerce',
)
---Extracting New Columns from Accident Date---
In [6]:
# Derive the calendar components of each accident date as new columns.
ukroadaccident = ukroadaccident.assign(
    Year=lambda frame: frame['Accident Date'].dt.year,
    Month_Number=lambda frame: frame['Accident Date'].dt.month,
    Month=lambda frame: frame['Accident Date'].dt.month_name(),
    Day=lambda frame: frame['Accident Date'].dt.day,
    DayofWeek=lambda frame: frame['Accident Date'].dt.dayofweek,  # Monday=0, Sunday=6
)
---Checking for Null Values---
In [7]:
# Count the missing values in every column.
ukroadaccident.isna().sum()
Out[7]:
Index 0 Accident_Severity 0 Accident Date 0 Latitude 25 Light_Conditions 0 District Area 0 Longitude 26 Number_of_Casualties 0 Number_of_Vehicles 0 Road_Surface_Conditions 726 Road_Type 4520 Urban_or_Rural_Area 15 Weather_Conditions 14128 Vehicle_Type 0 Year 0 Month_Number 0 Month 0 Day 0 DayofWeek 0 dtype: int64
---Fixing the Null Values---
-Numerical Null Values-
In [8]:
# Mean latitude of the recorded accidents (missing values are skipped by default).
ukroadaccident['Latitude'].mean()
Out[8]:
np.float64(52.553865761110956)
In [9]:
# Replace missing latitudes with the column mean.
# NOTE(review): this places the affected rows at the dataset's average
# latitude rather than near their district — confirm that is acceptable.
latitude_mean = ukroadaccident['Latitude'].mean()
ukroadaccident['Latitude'] = ukroadaccident['Latitude'].fillna(latitude_mean)
---------------------------------------------
In [10]:
# Mean longitude of the recorded accidents (missing values are skipped by default).
ukroadaccident['Longitude'].mean()
Out[10]:
np.float64(-1.431210368502073)
In [11]:
# Replace missing longitudes with the column mean.
# NOTE(review): this places the affected rows at the dataset's average
# longitude rather than near their district — confirm that is acceptable.
longitude_mean = ukroadaccident['Longitude'].mean()
ukroadaccident['Longitude'] = ukroadaccident['Longitude'].fillna(longitude_mean)
-Categorical Null Values-
In [12]:
# Label missing road-surface entries explicitly instead of guessing a value.
ukroadaccident['Road_Surface_Conditions'] = (
    ukroadaccident['Road_Surface_Conditions'].fillna(value='unaccounted')
)
---------------------------------------------
In [13]:
# Label missing road-type entries explicitly instead of guessing a value.
ukroadaccident['Road_Type'] = (
    ukroadaccident['Road_Type'].fillna(value='unaccounted')
)
---------------------------------------------
In [14]:
# Most frequent area type; it is used in the next cell to fill the missing entries.
ukroadaccident['Urban_or_Rural_Area'].mode()
Out[14]:
0 Urban Name: Urban_or_Rural_Area, dtype: object
In [15]:
# Fill the missing area types with the most frequent one.
most_common_area = ukroadaccident['Urban_or_Rural_Area'].mode().iloc[0]
ukroadaccident['Urban_or_Rural_Area'] = ukroadaccident['Urban_or_Rural_Area'].fillna(most_common_area)
---------------------------------------------
In [16]:
# Label missing weather entries explicitly instead of guessing a value.
ukroadaccident['Weather_Conditions'] = (
    ukroadaccident['Weather_Conditions'].fillna(value='unaccounted')
)
---Checking if there are still Null Values---
In [17]:
# Re-count missing values per column to confirm the fills left none behind.
ukroadaccident.isna().sum()
Out[17]:
Index 0 Accident_Severity 0 Accident Date 0 Latitude 0 Light_Conditions 0 District Area 0 Longitude 0 Number_of_Casualties 0 Number_of_Vehicles 0 Road_Surface_Conditions 0 Road_Type 0 Urban_or_Rural_Area 0 Weather_Conditions 0 Vehicle_Type 0 Year 0 Month_Number 0 Month 0 Day 0 DayofWeek 0 dtype: int64
---Checking the Data Type---
In [18]:
# Print each column's non-null count and dtype before the dtype conversion.
ukroadaccident.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 660679 entries, 0 to 660678 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Index 660679 non-null object 1 Accident_Severity 660679 non-null object 2 Accident Date 660679 non-null datetime64[ns] 3 Latitude 660679 non-null float64 4 Light_Conditions 660679 non-null object 5 District Area 660679 non-null object 6 Longitude 660679 non-null float64 7 Number_of_Casualties 660679 non-null int64 8 Number_of_Vehicles 660679 non-null int64 9 Road_Surface_Conditions 660679 non-null object 10 Road_Type 660679 non-null object 11 Urban_or_Rural_Area 660679 non-null object 12 Weather_Conditions 660679 non-null object 13 Vehicle_Type 660679 non-null object 14 Year 660679 non-null int32 15 Month_Number 660679 non-null int32 16 Month 660679 non-null object 17 Day 660679 non-null int32 18 DayofWeek 660679 non-null int32 dtypes: datetime64[ns](1), float64(2), int32(4), int64(2), object(10) memory usage: 85.7+ MB
---Fixing the Data Type---
In [19]:
# Store the label-like columns as pandas categoricals.
# NOTE(review): 'Index' looks like a per-accident identifier, so the category
# dtype may bring little benefit there — confirm before keeping it.
category_columns = [
    'Index',
    'Accident_Severity',
    'Light_Conditions',
    'District Area',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Vehicle_Type',
    'Month',
    'Year',
]
for column_name in category_columns:
    ukroadaccident[column_name] = ukroadaccident[column_name].astype('category')
---Checking again---
In [20]:
# Print the column dtypes again to confirm the category conversion took effect.
ukroadaccident.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 660679 entries, 0 to 660678 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Index 660679 non-null category 1 Accident_Severity 660679 non-null category 2 Accident Date 660679 non-null datetime64[ns] 3 Latitude 660679 non-null float64 4 Light_Conditions 660679 non-null category 5 District Area 660679 non-null category 6 Longitude 660679 non-null float64 7 Number_of_Casualties 660679 non-null int64 8 Number_of_Vehicles 660679 non-null int64 9 Road_Surface_Conditions 660679 non-null category 10 Road_Type 660679 non-null category 11 Urban_or_Rural_Area 660679 non-null category 12 Weather_Conditions 660679 non-null category 13 Vehicle_Type 660679 non-null category 14 Year 660679 non-null category 15 Month_Number 660679 non-null int32 16 Month 660679 non-null category 17 Day 660679 non-null int32 18 DayofWeek 660679 non-null int32 dtypes: category(11), datetime64[ns](1), float64(2), int32(3), int64(2) memory usage: 61.6 MB
---20 Questions and Insights---
~EDA~
1. What is the most usual vehicle type?
In [21]:
# Most frequent vehicle type among the recorded accidents.
ukroadaccident['Vehicle_Type'].mode()
Out[21]:
0 Car Name: Vehicle_Type, dtype: category Categories (16, object): ['Agricultural vehicle', 'Bus or coach (17 or more pass seats)', 'Car', 'Data missing or out of range', ..., 'Pedal cycle', 'Ridden horse', 'Taxi/Private hire car', 'Van / Goods 3.5 tonnes mgw or under']
Insight: According to the result, the type of vehicle most frequently involved in accidents is the car.
2. What is the usual light condition during the accidents?
In [22]:
# Most frequent light condition among the recorded accidents.
ukroadaccident['Light_Conditions'].mode()
Out[22]:
0 Daylight Name: Light_Conditions, dtype: category Categories (5, object): ['Darkness - lighting unknown', 'Darkness - lights lit', 'Darkness - lights unlit', 'Darkness - no lighting', 'Daylight']
Insight: The result shows that accidents are usually happening during daylight.
3. In terms of road surface conditions, which is the most frequent?
In [23]:
# Most frequent road-surface condition among the recorded accidents.
ukroadaccident['Road_Surface_Conditions'].mode()
Out[23]:
0 Dry Name: Road_Surface_Conditions, dtype: category Categories (6, object): ['Dry', 'Flood over 3cm. deep', 'Frost or ice', 'Snow', 'Wet or damp', 'unaccounted']
Insight: Based on the result, accidents are most frequently recorded on dry road surfaces.
4. How many vehicles are most commonly involved in an accident?
In [24]:
# Most frequent number of vehicles involved in a single accident.
ukroadaccident['Number_of_Vehicles'].mode()
Out[24]:
0 2 Name: Number_of_Vehicles, dtype: int64
Insight: The result shows that two vehicles are mostly involved during road accidents.
5. On what type of road do accidents usually happen?
In [25]:
# Most frequent road type among the recorded accidents.
ukroadaccident['Road_Type'].mode()
Out[25]:
0 Single carriageway Name: Road_Type, dtype: category Categories (6, object): ['Dual carriageway', 'One way street', 'Roundabout', 'Single carriageway', 'Slip road', 'unaccounted']
Insight: According to the result, accidents usually happen on a single carriageway road.
6. How many accidents were recorded per year?
In [26]:
# Number of accident records per year, sorted from most to fewest.
ukroadaccident['Year'].value_counts()
Out[26]:
Year 2019 182115 2020 170591 2021 163554 2022 144419 Name: count, dtype: int64
Insight 1: The number of recorded accidents decreases gradually year by year, from 27.57% of all records in 2019 to 21.84% in 2022.
Insight 2: The highest number of accidents was in 2019, followed by 2020 at 25.83% and 2021 at 24.76%; 2022 is the lowest at 21.84%.
7. Under which weather conditions are the fewest and the most accidents recorded?
In [27]:
# Number of accident records per weather condition, sorted from most to fewest.
ukroadaccident['Weather_Conditions'].value_counts()
Out[27]:
Weather_Conditions Fine no high winds 520885 Raining no high winds 79696 Other 17150 unaccounted 14128 Raining + high winds 9615 Fine + high winds 8554 Snowing no high winds 6238 Fog or mist 3528 Snowing + high winds 885 Name: count, dtype: int64
Insight 1: Based on the result, the majority of incidents (79%) happen under fine weather with no high winds weather condition. So, even a fine weather is not safe from road accidents.
Insight 2: Rain without high winds accounts for 12%, making it the second most common condition.
Insight 3: The other conditions each account for about 2% or less, and the lowest is snow with high winds at 0.13%.
Insight 4: There is a small portion of incidents classified as “Other” or “Unaccounted” (4.7% combined) which indicates the records under unknown conditions.
~Aggregation~
8. What is the total number of accidents per road type?
In [28]:
# Count the accident records for each road type.
# 'Road_Type' is categorical, so observed=False is passed explicitly: pandas is
# switching the default to observed=True, and the FutureWarning about it is
# hidden by the notebook's warning filter. Stating it keeps every category in
# the result regardless of the pandas version.
ukroadaccident.groupby('Road_Type', observed=False).size()
Out[28]:
Road_Type Dual carriageway 99424 One way street 13559 Roundabout 43992 Single carriageway 492143 Slip road 7041 unaccounted 4520 dtype: int64
Insight 1: Single carriageways had the highest record of incidents with 74.55%, which means this type of road is accident-prone.
Insight 2: Dual carriageways is 15%, showing fewer incidents compared to single carriageways.
Insight 3: A very small portion, 0.7%, is unaccounted for; it is the smallest group but still valuable.
9. What is the average number of casualties per accident for each accident severity?
In [29]:
# Average number of casualties per accident within each severity level.
# observed=False is stated explicitly because the grouping key is categorical
# and pandas is changing the default to observed=True.
ukroadaccident.groupby('Accident_Severity', observed=False)['Number_of_Casualties'].mean()
Out[29]:
Accident_Severity Fatal 1.903129 Serious 1.467280 Slight 1.331402 Name: Number_of_Casualties, dtype: float64
Insight: The average number of casualties per accident is highest for fatal accidents (1.903129), followed by serious accidents (1.467280), and lowest for slight accidents (1.331402).
10. How do accident severities vary across different light conditions?
In [30]:
# Cross-tabulate accident counts: severity as rows, light condition as columns.
# observed=False is stated explicitly because both keys are categorical and
# pandas is changing the default to observed=True, which would drop any
# severity/light combination that has no rows.
ukroadaccident.groupby(['Accident_Severity', 'Light_Conditions'], observed=False).size().unstack()
Out[30]:
| Light_Conditions | Darkness - lighting unknown | Darkness - lights lit | Darkness - lights unlit | Darkness - no lighting | Daylight |
|---|---|---|---|---|---|
| Accident_Severity | |||||
| Fatal | 68 | 1860 | 45 | 1612 | 5076 |
| Serious | 794 | 19130 | 360 | 7174 | 60759 |
| Slight | 5622 | 108345 | 2138 | 28651 | 419045 |
Insight: We can conclude that all three accident severities occur most often in daylight, and all three occur least often under "Darkness - lights unlit".
11. What is the distribution of accidents by road type and road surface condition?
In [31]:
# Cross-tabulate accident counts: road type as rows, road-surface condition as columns.
# observed=False is stated explicitly because both keys are categorical and
# pandas is changing the default to observed=True, which would drop any
# combination that has no rows.
ukroadaccident.groupby(['Road_Type', 'Road_Surface_Conditions'], observed=False).size().unstack()
Out[31]:
| Road_Surface_Conditions | Dry | Flood over 3cm. deep | Frost or ice | Snow | Wet or damp | unaccounted |
|---|---|---|---|---|---|---|
| Road_Type | ||||||
| Dual carriageway | 66205 | 302 | 2408 | 909 | 29533 | 67 |
| One way street | 10068 | 5 | 183 | 76 | 3195 | 32 |
| Roundabout | 30698 | 22 | 745 | 232 | 12209 | 86 |
| Single carriageway | 332698 | 672 | 14918 | 4585 | 138743 | 527 |
| Slip road | 4714 | 9 | 178 | 60 | 2074 | 6 |
| unaccounted | 3438 | 7 | 85 | 28 | 954 | 8 |
Insight: The result shows that for every road type the most accidents occur on dry road surfaces, and among the known surface conditions the fewest occur under "Flood over 3cm. deep" (the "unaccounted" group is even smaller for some road types).
12. In what type of area do accidents happen the most?
In [32]:
# Count the accident records for each area type.
# observed=False is stated explicitly because the grouping key is categorical
# and pandas is changing the default to observed=True.
ukroadaccident.groupby('Urban_or_Rural_Area', observed=False).size()
Out[32]:
Urban_or_Rural_Area Rural 238990 Unallocated 11 Urban 421678 dtype: int64
Insight 1: According to the result, accidents usually happen at Urban areas with 63.82%. On the other hand, Rural areas account for 36%, still a significant portion.
Insight 2: Rural areas record a little more than half as many accidents as urban areas; both counts are large in absolute terms, so both area types matter.
13. In which month do accidents usually happen?
In [33]:
# Count the accident records for each month name.
# observed=False is stated explicitly because 'Month' is categorical and pandas
# is changing the default to observed=True.
ukroadaccident.groupby('Month', observed=False).size()
Out[33]:
Month April 51744 August 53913 December 51836 February 49491 January 52872 July 57445 June 56481 March 54086 May 56352 November 60424 October 59580 September 56455 dtype: int64
Insight 1: The result shows that accidents usually happen during the month of November with 9.15% and February (7.49%) has the lowest.
Insight 2: Most months are fairly close, around 7.5–8.7%, showing fairly consistent incident distribution across the year.
Insight 3: Summer months (June–August) account for roughly ~25% combined, indicating higher traffic or activity during that period.
Insight 4: The slight peaks in October(9.02%)–November(9.15%) could indicate weather changes, holidays, or increased traffic contributing to incidents.
14. What is the distribution of accidents by accident severity and vehicle type?
In [34]:
# Cross-tabulate accident counts: severity as rows, vehicle type as columns.
# observed=False is stated explicitly because both keys are categorical and
# pandas is changing the default to observed=True; the zero cells in this
# table (e.g. fatal accidents for 'Ridden horse') exist only because
# unobserved combinations are kept.
ukroadaccident.groupby(['Accident_Severity', 'Vehicle_Type'], observed=False).size().unstack()
Out[34]:
| Vehicle_Type | Agricultural vehicle | Bus or coach (17 or more pass seats) | Car | Data missing or out of range | Goods 7.5 tonnes mgw and over | Goods over 3.5t. and under 7.5t | Minibus (8 - 16 passenger seats) | Motorcycle 125cc and under | Motorcycle 50cc and under | Motorcycle over 125cc and up to 500cc | Motorcycle over 500cc | Other vehicle | Pedal cycle | Ridden horse | Taxi/Private hire car | Van / Goods 3.5 tonnes mgw or under |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accident_Severity | ||||||||||||||||
| Fatal | 21 | 325 | 6577 | 0 | 216 | 67 | 29 | 189 | 95 | 105 | 339 | 70 | 6 | 0 | 155 | 467 |
| Serious | 282 | 3373 | 66461 | 0 | 2321 | 857 | 276 | 2031 | 1014 | 1014 | 3457 | 767 | 39 | 0 | 1771 | 4554 |
| Slight | 1644 | 22180 | 424954 | 6 | 14770 | 5172 | 1671 | 13049 | 6494 | 6537 | 21861 | 4800 | 152 | 4 | 11368 | 29139 |
Insight: According to the result, car is the vehicle type that got the highest accident of all the three types of accident severity, while ridden horse has the least.
~Correlation~
15. Is there a correlation between the number of casualties and number of vehicles?
In [35]:
# Correlation coefficient (Series.corr default, i.e. Pearson) between casualties and vehicles per accident.
ukroadaccident['Number_of_Casualties'].corr(ukroadaccident['Number_of_Vehicles'])
Out[35]:
np.float64(0.2288888612692756)
Insight: The result (r ≈ 0.23) shows only a weak positive correlation between the number of casualties and the number of vehicles.
16. Is there a correlation between latitude and number of casualties?
In [36]:
# Correlation coefficient (Series.corr default, i.e. Pearson) between latitude and casualties per accident.
ukroadaccident['Latitude'].corr(ukroadaccident['Number_of_Casualties'])
Out[36]:
np.float64(0.032200686625906395)
Insight: Based on the result, there is no correlation between latitude and number of casualties.
17. Is there a correlation between longitude and number of casualties?
In [37]:
# Correlation coefficient (Series.corr default, i.e. Pearson) between longitude and casualties per accident.
ukroadaccident['Longitude'].corr(ukroadaccident['Number_of_Casualties'])
Out[37]:
np.float64(-0.0404056457884545)
Insight: The result shows that there is no correlation between longitude and number of casualties.
18. Is there a significant difference in the number of vehicles across accident severities?
In [38]:
# List the distinct severity labels; they define the groups compared in the ANOVA below.
ukroadaccident['Accident_Severity'].unique()
Out[38]:
['Serious', 'Slight', 'Fatal'] Categories (3, object): ['Fatal', 'Serious', 'Slight']
In [39]:
# Split the vehicle counts into one sample per severity level for the ANOVA.
severity_labels = ukroadaccident['Accident_Severity']
vehiserious = ukroadaccident.loc[severity_labels == 'Serious', 'Number_of_Vehicles']
vehislight = ukroadaccident.loc[severity_labels == 'Slight', 'Number_of_Vehicles']
vehifatal = ukroadaccident.loc[severity_labels == 'Fatal', 'Number_of_Vehicles']
In [40]:
# One-way ANOVA on the number of vehicles across the three severity groups.
# `result` receives the F statistic; only the p-value is displayed.
# NOTE(review): a p-value shown as 0.0 is presumably below floating-point
# resolution rather than exactly zero.
result, pvalue = f_oneway(vehiserious, vehislight, vehifatal)
pvalue
Out[40]:
np.float64(0.0)
Insight: The p-value is effectively zero, so the mean number of vehicles differs significantly across the accident severity levels. Statistical significance alone does not tell us how large that difference is.
19. Is there a significant difference between the area and number of casualties?
In [41]:
# List the distinct area labels; they define the groups compared in the ANOVA below.
ukroadaccident['Urban_or_Rural_Area'].unique()
Out[41]:
['Urban', 'Rural', 'Unallocated'] Categories (3, object): ['Rural', 'Unallocated', 'Urban']
In [42]:
# Split the casualty counts into one sample per area type for the ANOVA.
area_labels = ukroadaccident['Urban_or_Rural_Area']
urbancasualty = ukroadaccident.loc[area_labels == 'Urban', 'Number_of_Casualties']
ruralcasualty = ukroadaccident.loc[area_labels == 'Rural', 'Number_of_Casualties']
unallocatedcasualty = ukroadaccident.loc[area_labels == 'Unallocated', 'Number_of_Casualties']
In [43]:
# One-way ANOVA on the number of casualties across the three area groups.
# `result` receives the F statistic; only the p-value is displayed.
# NOTE(review): a p-value shown as 0.0 is presumably below floating-point
# resolution rather than exactly zero.
result, pvalue = f_oneway(urbancasualty, ruralcasualty, unallocatedcasualty)
pvalue
Out[43]:
np.float64(0.0)
Insight: According to the result, there is a significant difference between the areas and the number of casualties.
20. Is there a significant difference in latitude across light conditions?
In [44]:
# List the distinct light-condition labels; they define the groups compared in the ANOVA below.
ukroadaccident['Light_Conditions'].unique()
Out[44]:
['Darkness - lights lit', 'Daylight', 'Darkness - lighting unknown', 'Darkness - lights unlit', 'Darkness - no lighting'] Categories (5, object): ['Darkness - lighting unknown', 'Darkness - lights lit', 'Darkness - lights unlit', 'Darkness - no lighting', 'Daylight']
In [45]:
# Split the latitudes into one sample per light condition for the ANOVA.
light_labels = ukroadaccident['Light_Conditions']
lighta = ukroadaccident.loc[light_labels == 'Darkness - lights lit', 'Latitude']
lightb = ukroadaccident.loc[light_labels == 'Daylight', 'Latitude']
lightc = ukroadaccident.loc[light_labels == 'Darkness - lighting unknown', 'Latitude']
lightd = ukroadaccident.loc[light_labels == 'Darkness - lights unlit', 'Latitude']
lighte = ukroadaccident.loc[light_labels == 'Darkness - no lighting', 'Latitude']
In [46]:
# One-way ANOVA on latitude across the five light-condition groups.
# `result` receives the F statistic; only the p-value is displayed.
result, pvalue = f_oneway(lighta, lightb, lightc, lightd, lighte)
pvalue
Out[46]:
np.float64(1.224998791423201e-27)
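Insight: The very small p-value shows that the mean latitude of accidents differs significantly across light conditions. This does not mean that light conditions affect latitude; it only shows that accidents under different light conditions tend to occur at different latitudes.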
---Additional Five Questions about Accident Date---
-UniVariate-
21. Which day of the week had the most recorded accidents?
In [47]:
# Accident records per weekday code (0 = Monday ... 6 = Sunday, as set when the column was derived), sorted from most to fewest.
ukroadaccident['DayofWeek'].value_counts()
Out[47]:
DayofWeek 5 107178 2 99558 3 99511 4 97900 1 94550 6 89302 0 72680 Name: count, dtype: int64
Insight: Based on the result, accidents occur most often on Saturdays and least often on Mondays.
-BiVariate-
22. What is the number of accidents per month in each year?
In [48]:
# Count accidents for every month/year pair, then pivot the years into columns.
# observed=False is stated explicitly because both keys are categorical and
# pandas is changing the default to observed=True, which would drop any
# month/year pair that has no rows.
accidentpermonth = ukroadaccident.groupby(['Month', 'Year'], observed=False).size()
accidentpermonth.unstack()
Out[48]:
| Year | 2019 | 2020 | 2021 | 2022 |
|---|---|---|---|---|
| Month | ||||
| April | 14125 | 13394 | 12715 | 11510 |
| August | 15044 | 13366 | 13415 | 12088 |
| December | 14708 | 13794 | 13709 | 9625 |
| February | 13253 | 14353 | 10950 | 10935 |
| January | 15355 | 14133 | 13417 | 9967 |
| July | 15862 | 14630 | 14300 | 12653 |
| June | 15528 | 14205 | 13936 | 12812 |
| March | 15049 | 13494 | 13202 | 12341 |
| May | 15833 | 14336 | 13811 | 12372 |
| November | 16559 | 14770 | 15473 | 13622 |
| October | 15528 | 15684 | 14834 | 13534 |
| September | 15271 | 14432 | 13792 | 12960 |
Insight 1: Based on their percentage equivalents, November is the peak month of accidents in three of the four years, with two consecutive years above 9% (2021 and 2022). However, October also has a high percentage, with three years at 9% or above, although November slightly beats it in most years.
Insight 2: February has the lowest percentage of accidents overall, especially in 2021 at just 6.7%. This suggests that road accidents tend to be more common in October and November, while February is generally a safer month.
Insight 3: Most months had their highest accident counts in 2019. The numbers decreased over 2020–2022, which may be related to the pandemic.
23. What is the state of accident severity across years?
In [49]:
# Cross-tabulate accident counts: severity as rows, year as columns.
# observed=False is stated explicitly because both keys are categorical and
# pandas is changing the default to observed=True, which would drop any
# severity/year pair that has no rows.
ukroadaccident.groupby(['Accident_Severity', 'Year'], observed=False).size().unstack()
Out[49]:
| Year | 2019 | 2020 | 2021 | 2022 |
|---|---|---|---|---|
| Accident_Severity | ||||
| Fatal | 2714 | 2341 | 2057 | 1549 |
| Serious | 24322 | 23121 | 21997 | 18777 |
| Slight | 155079 | 145129 | 139500 | 124093 |
Insight 1: From the number itself, it shows that deadly crashes are rare compared to other types of accident severity. Fatal accidents are only around 1% of all cases. They even decreased slightly over the years, from 1.49% in 2019 down to 1.07% in 2022.
Insight 2: On the other hand, slight accidents had the highest consistent record at around 85% every year.
Insight 3: Serious accidents fall in the middle, staying between 13% and 14%.
Insight 4: Accidents across the years were slight and less harmful, a small portion are serious, and very few cases are fatal.
-MultiVariate-
24. What is the average casualties per month, year, and road surface conditions?
In [50]:
# Average casualties per accident for every month/year pair, with the
# road-surface conditions pivoted into columns.
# observed=False is stated explicitly because all three keys are categorical
# and pandas is changing the default to observed=True. Combinations without
# any rows appear as NaN in the table.
ukroadaccident.groupby(['Month', 'Year', 'Road_Surface_Conditions'], observed=False)['Number_of_Casualties'].mean().unstack()
Out[50]:
| Road_Surface_Conditions | Dry | Flood over 3cm. deep | Frost or ice | Snow | Wet or damp | unaccounted | |
|---|---|---|---|---|---|---|---|
| Month | Year | ||||||
| April | 2019 | 1.360358 | 1.000000 | 1.200000 | NaN | 1.385888 | 1.000000 |
| 2020 | 1.339197 | 1.538462 | 1.560606 | 1.442308 | 1.442780 | 1.000000 | |
| 2021 | 1.353704 | 1.000000 | 1.000000 | 2.000000 | 1.420137 | 1.181818 | |
| 2022 | 1.357578 | 1.000000 | 1.500000 | 1.250000 | 1.485714 | 1.181818 | |
| August | 2019 | 1.383017 | 1.636364 | 1.000000 | 1.000000 | 1.458948 | 1.304348 |
| 2020 | 1.375027 | 1.578947 | 1.000000 | 2.000000 | 1.450013 | 1.304348 | |
| 2021 | 1.385929 | 2.090909 | NaN | 1.000000 | 1.463757 | 1.000000 | |
| 2022 | 1.379569 | 1.750000 | 1.500000 | 2.000000 | 1.415216 | 1.400000 | |
| December | 2019 | 1.286649 | 1.531915 | 1.350850 | 1.181818 | 1.422541 | 1.000000 |
| 2020 | 1.335708 | 1.708333 | 1.345857 | 1.252427 | 1.401117 | 1.142857 | |
| 2021 | 1.274566 | 1.500000 | 1.380342 | 1.374680 | 1.388427 | 1.461538 | |
| 2022 | 1.297656 | 1.600000 | 1.385561 | 1.376316 | 1.415217 | 1.125000 | |
| February | 2019 | 1.315298 | 1.083333 | 1.287305 | 1.312000 | 1.401046 | 1.133333 |
| 2020 | 1.318313 | 1.600000 | 1.346648 | 1.661017 | 1.383009 | 1.071429 | |
| 2021 | 1.326901 | 1.176471 | 1.364519 | 1.303318 | 1.362491 | 1.272727 | |
| 2022 | 1.325941 | 1.714286 | 1.317593 | 1.402632 | 1.388079 | 1.187500 | |
| January | 2019 | 1.301965 | 1.358974 | 1.331887 | 1.278571 | 1.403802 | 1.150000 |
| 2020 | 1.305425 | 1.515625 | 1.308008 | 1.301887 | 1.369336 | 1.217391 | |
| 2021 | 1.331680 | 1.333333 | 1.315061 | 1.346535 | 1.387118 | 1.357143 | |
| 2022 | 1.285207 | 1.111111 | 1.336806 | 1.334642 | 1.347077 | 1.222222 | |
| July | 2019 | 1.342417 | 1.538462 | 1.000000 | 1.400000 | 1.459278 | 1.444444 |
| 2020 | 1.324689 | 1.583333 | 1.000000 | 1.000000 | 1.445199 | 1.500000 | |
| 2021 | 1.340411 | 1.444444 | 1.000000 | NaN | 1.474479 | 1.066667 | |
| 2022 | 1.339222 | 1.750000 | 1.000000 | 1.000000 | 1.470771 | 1.000000 | |
| June | 2019 | 1.336180 | 1.506667 | 1.000000 | 1.000000 | 1.447362 | 1.117647 |
| 2020 | 1.333754 | 1.166667 | 1.000000 | NaN | 1.416375 | 1.076923 | |
| 2021 | 1.329111 | 1.555556 | NaN | 1.333333 | 1.442604 | 1.714286 | |
| 2022 | 1.339122 | 2.000000 | 1.000000 | NaN | 1.399510 | 1.000000 | |
| March | 2019 | 1.330359 | 1.294118 | 1.231579 | 1.411111 | 1.435946 | 1.062500 |
| 2020 | 1.333562 | 1.352941 | 1.452555 | 1.352381 | 1.405273 | 1.450000 | |
| 2021 | 1.334661 | 1.200000 | 1.346939 | 1.678571 | 1.413530 | 1.250000 | |
| 2022 | 1.325219 | 1.333333 | 1.246914 | 1.500000 | 1.411477 | 1.227273 | |
| May | 2019 | 1.346532 | 1.684211 | 1.000000 | 1.000000 | 1.442656 | 1.100000 |
| 2020 | 1.337205 | 1.400000 | 1.500000 | 1.333333 | 1.457971 | 1.352941 | |
| 2021 | 1.354493 | 1.666667 | 1.500000 | 1.333333 | 1.417683 | 1.071429 | |
| 2022 | 1.338688 | 1.250000 | 1.333333 | 1.000000 | 1.500753 | 1.250000 | |
| November | 2019 | 1.290865 | 1.260870 | 1.369295 | 1.428571 | 1.400236 | 1.043478 |
| 2020 | 1.300952 | 1.583333 | 1.434132 | 1.384615 | 1.387696 | 1.200000 | |
| 2021 | 1.294445 | 1.452381 | 1.243243 | 1.923077 | 1.393972 | 1.421053 | |
| 2022 | 1.318023 | 1.444444 | 1.356423 | 1.371429 | 1.389572 | 1.142857 | |
| October | 2019 | 1.330498 | 1.272727 | 1.400000 | 1.000000 | 1.393419 | 1.153846 |
| 2020 | 1.314318 | 1.518519 | 1.347150 | 1.485714 | 1.397198 | 1.090909 | |
| 2021 | 1.324756 | 1.533333 | 2.200000 | 1.000000 | 1.426586 | 1.076923 | |
| 2022 | 1.336587 | 1.705882 | 1.188679 | 1.714286 | 1.388816 | 1.210526 | |
| September | 2019 | 1.339429 | 1.250000 | 1.000000 | 1.500000 | 1.435417 | 1.307692 |
| 2020 | 1.323790 | 1.509804 | 1.000000 | 1.000000 | 1.395795 | 1.083333 | |
| 2021 | 1.328964 | 2.111111 | NaN | 1.500000 | 1.401223 | 1.411765 | |
| 2022 | 1.332019 | 1.400000 | 1.000000 | 2.000000 | 1.408525 | 1.636364 |
Insight 1: The average number of casualties is slightly higher on wet/damp roads (around 1.40–1.47) than on dry roads (around 1.30–1.36) across almost all months and years.
Insight 2: The "Flood over 3cm deep" condition often shows averages above 1.5 and sometimes above 2.0 casualties per accident.
Insight 3: The averages for frost/ice and snow conditions vary a lot: both are usually around 1.0, but they occasionally spike to 2.0 or above (October 2021 has the highest value at 2.2).
Insight 4: Most accidents happen on dry roads, but accidents on wet and flooded roads tend to involve slightly more casualties. Snow and ice conditions occur less frequently and their averages fluctuate widely, occasionally producing the highest casualty averages in the table.
25. What is the distribution of accidents by year, month, and rural/urban area?
In [51]:
# Count accidents for every area/year pair, with the months pivoted into columns.
# observed=False is stated explicitly because all three keys are categorical
# and pandas is changing the default to observed=True; the all-zero
# 'Unallocated' rows for 2021 and 2022 exist only because unobserved
# combinations are kept.
ukroadaccident.groupby(['Urban_or_Rural_Area', 'Year', 'Month'], observed=False).size().unstack()
Out[51]:
| Month | April | August | December | February | January | July | June | March | May | November | October | September | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Urban_or_Rural_Area | Year | ||||||||||||
| Rural | 2019 | 5169 | 6001 | 5499 | 4874 | 5807 | 6047 | 5835 | 5350 | 5802 | 5974 | 5518 | 5682 |
| 2020 | 4752 | 5204 | 5167 | 5226 | 5182 | 5415 | 5105 | 4947 | 5184 | 5251 | 5415 | 5143 | |
| 2021 | 4555 | 5256 | 5106 | 4144 | 4925 | 5277 | 4943 | 4391 | 4873 | 5456 | 5260 | 4830 | |
| 2022 | 3956 | 4537 | 3589 | 3896 | 3626 | 4476 | 4317 | 4118 | 4266 | 4671 | 4593 | 4380 | |
| Unallocated | 2019 | 0 | 0 | 2 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 2020 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | |
| 2021 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| 2022 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | |
| Urban | 2019 | 8956 | 9043 | 9207 | 8379 | 9547 | 9814 | 9692 | 9699 | 10031 | 10585 | 10010 | 9589 |
| 2020 | 8641 | 8162 | 8626 | 9127 | 8951 | 9215 | 9099 | 8546 | 9150 | 9519 | 10269 | 9289 | |
| 2021 | 8160 | 8159 | 8603 | 6806 | 8492 | 9023 | 8993 | 8811 | 8938 | 10017 | 9574 | 8962 | |
| 2022 | 7554 | 7551 | 6036 | 7039 | 6341 | 8177 | 8495 | 8223 | 8106 | 8951 | 8941 | 8580 |
Insight 1: In all months and years, accidents mostly happen in urban areas, at roughly 60%–67% of each month's records, while accidents in rural areas are lower at roughly 33%–40%.
Insight 2: The highest urban share was in March 2021 at about 66.7%, while the lowest was in August 2019 at about 60.1%.
Insight 3: In rural areas, the lowest share was about 33.3% in March 2021, while the highest was about 39.9% in August 2019.
Insight 4: Most accidents happen in urban areas (cities) regardless of the month and year. Rural accidents are fewer, but their share still peaks in one month each year (August).
---Additional questions with matplotlib---
In [52]:
import matplotlib.pyplot as plt
In [53]:
# Summarise the frame (columns, non-null counts, dtypes, memory usage).
ukroadaccident.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 660679 entries, 0 to 660678 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Index 660679 non-null category 1 Accident_Severity 660679 non-null category 2 Accident Date 660679 non-null datetime64[ns] 3 Latitude 660679 non-null float64 4 Light_Conditions 660679 non-null category 5 District Area 660679 non-null category 6 Longitude 660679 non-null float64 7 Number_of_Casualties 660679 non-null int64 8 Number_of_Vehicles 660679 non-null int64 9 Road_Surface_Conditions 660679 non-null category 10 Road_Type 660679 non-null category 11 Urban_or_Rural_Area 660679 non-null category 12 Weather_Conditions 660679 non-null category 13 Vehicle_Type 660679 non-null category 14 Year 660679 non-null category 15 Month_Number 660679 non-null int32 16 Month 660679 non-null category 17 Day 660679 non-null int32 18 DayofWeek 660679 non-null int32 dtypes: category(11), datetime64[ns](1), float64(2), int32(3), int64(2) memory usage: 61.6 MB
26. What is the accident distribution by severity and light conditions in Birmingham (the district area with the most recorded accidents)?
In [54]:
# Restrict to Birmingham rows, then count accidents for every
# (severity, light condition) pair.
in_birmingham = ukroadaccident['District Area'] == 'Birmingham'
birmingham = ukroadaccident[in_birmingham]
severity_light_keys = ['Accident_Severity', 'Light_Conditions']
accidentbirmingham = birmingham.groupby(severity_light_keys).size()
accidentbirmingham
Out[54]:
Accident_Severity Light_Conditions
Fatal Darkness - lighting unknown 2
Darkness - lights lit 47
Darkness - lights unlit 3
Darkness - no lighting 0
Daylight 53
Serious Darkness - lighting unknown 2
Darkness - lights lit 483
Darkness - lights unlit 3
Darkness - no lighting 6
Daylight 980
Slight Darkness - lighting unknown 78
Darkness - lights lit 3142
Darkness - lights unlit 45
Darkness - no lighting 13
Daylight 8634
dtype: int64
In [55]:
# Move severity into the columns so that light conditions are on the x-axis
# and stacked=True draws one coloured segment per severity. Plotting the raw
# MultiIndex Series gives a single bar per (severity, light) pair: nothing is
# stacked, the x-axis no longer matches its "Light Conditions" label, and the
# legend has no per-severity entries.
severity_by_light = accidentbirmingham.unstack('Accident_Severity')
severity_by_light.plot(kind='bar', stacked=True, figsize=(10,6))
plt.title("Accidents in Birmingham by Severity & Light Conditions")
plt.xlabel("Light Conditions")
plt.ylabel("Number of Accidents")
plt.legend(title="Accident Severity")
plt.show()
Insight: The chart shows that most accidents in Birmingham are slight-severity accidents in daylight, followed by slight accidents in darkness with lights lit. Serious accidents in daylight make up a much smaller portion.
27. What is the car accident distribution by severity per year?
In [56]:
# Keep only the accidents whose vehicle type is 'Car'.
car = ukroadaccident.loc[ukroadaccident['Vehicle_Type'].eq('Car')]
# One row per (year, severity) with the number of car accidents. Naming the
# size column in reset_index already yields Year / Accident_Severity / Count.
grouped = (
    car.groupby(['Year', 'Accident_Severity'])
    .size()
    .reset_index(name='Count')
)
grouped
Out[56]:
| Year | Accident_Severity | Count | |
|---|---|---|---|
| 0 | 2019 | Fatal | 1948 |
| 1 | 2019 | Serious | 17469 |
| 2 | 2019 | Slight | 110887 |
| 3 | 2020 | Fatal | 1772 |
| 4 | 2020 | Serious | 17384 |
| 5 | 2020 | Slight | 108738 |
| 6 | 2021 | Fatal | 1616 |
| 7 | 2021 | Serious | 17143 |
| 8 | 2021 | Slight | 109267 |
| 9 | 2022 | Fatal | 1241 |
| 10 | 2022 | Serious | 14465 |
| 11 | 2022 | Slight | 96062 |
In [57]:
# Reshape the long (Year, Accident_Severity, Count) table to wide form:
# one row per year, one column per severity.
pivoted = grouped.pivot(
    index='Year',
    columns='Accident_Severity',
    values='Count',
)
In [58]:
# One line per severity, with a marker at every year.
ax = pivoted.plot.line(marker='o', figsize=(10, 6))
ax.set_title("Car Accident Distribution by Severity per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Car Accidents")
ax.legend(title="Severity")
plt.show()
Insight: As the line graph shows, there is a huge gap between slight-severity accidents and the other two severities. Fatal car accidents are the fewest of all (roughly 1,200-1,900 per year). Serious car accidents number roughly 14,000-17,500 per year, while slight-severity car accidents are at the top with more than 100k records in each of 2019-2021, dropping somewhat in 2022.
28. What is the accident distribution by road surface conditions per area type (rural, urban)?
In [59]:
# Rows: area type; columns: road surface condition; values: accident counts.
surface_counts = ukroadaccident.groupby(
    ['Urban_or_Rural_Area', 'Road_Surface_Conditions']
).size()
surfacearea = surface_counts.unstack()

# Grouped bars: one cluster per area type, one bar per surface condition.
ax = surfacearea.plot.bar(figsize=(10, 6))
ax.set_title("Accidents by Road Surface Conditions and Area Type")
ax.set_ylabel("Number of Accidents")
ax.set_xlabel("Area (Urban / Rural)")
ax.legend(title="Road Surface Conditions")
plt.show()
Insight: Both rural and urban areas show accidents across the road surface conditions. However, urban areas have the highest number of accidents on dry and wet/damp surfaces, while rural areas have far more accidents in frost/ice and snow conditions.
29. Comparing the weather conditions among the three district areas with the most accidents and the district area with the fewest.
In [60]:
# The three district areas with the most recorded accidents.
district_counts = ukroadaccident['District Area'].value_counts()
district_counts.head(3)
Out[60]:
District Area Birmingham 13491 Leeds 8898 Manchester 6720 Name: count, dtype: int64
In [61]:
# The district area with the fewest recorded accidents (last row of the
# descending value_counts ranking).
district_totals = ukroadaccident['District Area'].value_counts()
district_totals.tail(1)
Out[61]:
District Area Clackmannanshire 91 Name: count, dtype: int64
In [62]:
# Accident counts per weather condition in Birmingham, most frequent first.
weatherbirmingham = ukroadaccident.loc[ukroadaccident['District Area'] == 'Birmingham']
birmiweather = weatherbirmingham['Weather_Conditions'].value_counts()

ax = birmiweather.plot.line(marker='o', figsize=(10, 6))
ax.set_title("Weather condition in Birmingham")
ax.set_xlabel("Weather Conditions")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)  # category names are long; tilt them
plt.show()
In [63]:
# Accident counts per weather condition in Leeds, most frequent first.
weatherleeds = ukroadaccident.loc[ukroadaccident['District Area'] == 'Leeds']
leedsweather = weatherleeds['Weather_Conditions'].value_counts()

ax = leedsweather.plot.line(marker='o', figsize=(10, 6))
ax.set_title("Weather condition in Leeds")
ax.set_xlabel("Weather Conditions")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)  # category names are long; tilt them
plt.show()
In [64]:
# Accident counts per weather condition in Manchester, most frequent first.
weathermanchester = ukroadaccident.loc[ukroadaccident['District Area'] == 'Manchester']
manchesterweather = weathermanchester['Weather_Conditions'].value_counts()

ax = manchesterweather.plot.line(marker='o', figsize=(10, 6))
ax.set_title("Weather condition in Manchester")
ax.set_xlabel("Weather Conditions")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)  # category names are long; tilt them
plt.show()
In [65]:
# Accident counts per weather condition in Clackmannanshire, most frequent first.
weatherclack = ukroadaccident.loc[ukroadaccident['District Area'] == 'Clackmannanshire']
clackweather = weatherclack['Weather_Conditions'].value_counts()

ax = clackweather.plot.line(marker='o', figsize=(10, 6))
ax.set_title("Weather condition in Clackmannanshire")
ax.set_xlabel("Weather Conditions")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)  # category names are long; tilt them
plt.show()
Insight 1: At first glance there isn't much difference between the charts, but a closer look reveals how they differ.
Insight 2: First, all four districts have the same weather condition ("Fine no high winds") as the one with the highest number of incidents (regardless of the total number of incidents per district), and "snowy without winds" is the lowest.
Insight 3: "Raining no high winds" is the second most common condition in all four districts as well.
Insight 4: From the third to the eighth condition, the ranking varies across the charts.
In [66]:
# Look at the data without the outlier (Fine no high winds) in all four districts above.
---SIX DISTRICT AREA ANALYSIS COMPARED TO NATIONAL TRENDS---
In [67]:
import folium
from folium.plugins import HeatMap
-National Analysis-
In [68]:
import matplotlib.pyplot as plt
# 1. Count accidents per (Year, Month_Number). The numeric month is used
#    instead of 'Month' because the month-name categories sort alphabetically
#    (April, August, December, ...); with them, positions 0-11 are not
#    Jan-Dec and the tick labels below would land on the wrong months.
monthly_counts = ukroadaccident.groupby(['Year', 'Month_Number']).size()

# 2. Unstack Year so every year becomes its own column (one line per year).
monthly_data = monthly_counts.unstack(level=0)

# 3. Plot the yearly lines against month numbers 1-12.
monthly_data.plot(
    marker='o',
    figsize=(10, 5),
    title='Monthly Road Accidents by Year (2019-2022)'
)
# The index now holds month numbers, so the ticks sit at 1..12 (not 0..11).
plt.xticks(range(1, 13), ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
# The lines are years only; no severity is plotted here.
plt.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(alpha=0.3)
plt.show()
-National Seasonal Analysis-
In [69]:
# Accident counts per year, per calendar month and per weekday, each sorted
# by its key so the bars appear in chronological order.
accidents_per_year = ukroadaccident['Year'].value_counts().sort_index()
accidents_per_month = ukroadaccident['Month_Number'].value_counts().sort_index()
accidents_per_day = ukroadaccident['DayofWeek'].value_counts().sort_index()

# Per year
accidents_per_year.plot(kind='bar')
plt.title('Accidents per Year')
plt.xlabel('Year')
plt.ylabel('Number of Accidents')
plt.show()

# Per month
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
# Replace numerical months with their names
accidents_per_month.index = accidents_per_month.index.astype(int).map(month_names)
accidents_per_month.plot(kind='bar')
plt.title('Accidents per Month')
plt.xlabel('Month')
plt.ylabel('Number of Accidents')
plt.show()

# Per day of the week
# NOTE(review): assumes DayofWeek uses the pandas dt.dayofweek convention
# (Monday=0 ... Sunday=6) - confirm against the cell that creates the column.
day_names = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
accidents_per_day.index = accidents_per_day.index.map(day_names)
accidents_per_day.plot(kind='bar')
# Title and axis labels were missing on this chart only; added so it matches
# the per-year and per-month charts and can be read on its own.
plt.title('Accidents per Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Number of Accidents')
plt.show()
print(accidents_per_day)
DayofWeek Monday 72680 Tuesday 94550 Wednesday 99558 Thursday 99511 Friday 97900 Saturday 107178 Sunday 89302 Name: count, dtype: int64
Accident Severity
In [70]:
# Share of each severity level over the whole dataset.
severity_counts = ukroadaccident['Accident_Severity'].value_counts()
plt.pie(severity_counts,
        labels=severity_counts.index,
        autopct='%1.1f%%',
        startangle=90)
plt.gcf().set_size_inches(10, 7)
plt.title('Accident Severity Distribution')
plt.show()

# Same breakdown restricted to accidents recorded in 2019.
# NOTE(review): 'Year' is a category column; this comparison assumes its
# categories are integers - confirm where the column is created.
severity_2019 = ukroadaccident[ukroadaccident['Year'] == 2019]
sc_2019 = severity_2019['Accident_Severity'].value_counts()
plt.pie(sc_2019,
        labels=sc_2019.index,
        autopct='%1.1f%%',
        startangle=90)
plt.gcf().set_size_inches(10, 7)
# The title names the year so this figure is distinguishable from the
# all-years pie above (both previously carried the same title).
plt.title('Accident Severity Distribution (2019)')
plt.show()
In [140]:
import folium
from folium.plugins import HeatMap
# ukmap = folium.Map(location=[54, -2], zoom_start=6)
# for idx, row in ukroadaccident.iterrows():
# folium.Marker([row['Latitude'], row['Longitude']]).add_to(ukmap)
# ukmap.save('Accident_map.html')
# Build (latitude, longitude) pairs for every accident and draw them as a
# heat map centred on the mean coordinate of the dataset.
latitudes = ukroadaccident['Latitude']
longitudes = ukroadaccident['Longitude']
uk = list(zip(latitudes, longitudes))

map_centre = [latitudes.mean(), longitudes.mean()]
m = folium.Map(location=map_centre, zoom_start=10)
HeatMap(uk).add_to(m)
m
# m.save('Aberdeen_heatmap.html')
Out[140]:
Make this Notebook Trusted to load map: File -> Trust Notebook